home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Software Vault: The Gold Collection
/
Software Vault - The Gold Collection (American Databankers) (1993).ISO
/
cdr46
/
strx221.zip
/
MATCH.CPP
< prev
next >
Wrap
C/C++ Source or Header
|
1993-03-15
|
15KB
|
515 lines
/*
*
* Author: Allen I. Holub
*
* (c) C Gazette. May be used freely as long as author and publication are
* acknowledged
*
* Roy S. Woll (Revision 2.0)
* 1032 Summerplace Dr.
* San Jose, CA 95122
*
* ----------------------------------------------------------------------
*
*
* Revision 2.02 14 Mar 1993 ROY S. WOLL
*
* Fixed octal set definition to include first octel digit
*
* Revision 2.0 16 Nov 1992 ROY S. WOLL
*
* Initial revision for match.c -> match.cpp
* Compatibility with C++ syntax, and now member functions of regX.h
* to avoid polluting the name space.
* Fixed some inconsistencies. Regular expression compiled pattern should
* now grow as needed. Case insensitive support.
*
* Revision 1 27 Jan 1991 Allen I. Holub
*
*/
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include "regximp.h"
inline const char * max(const char * x, const char * y)
{if (x>y) return x; else return y;}
/* Metacharacters in the input: */
#define BOL '^' /* start-of-line anchor */
#define EOL '$' /* end-of-line anchor */
#define ANY '.' /* matches any character */
#define CCL '[' /* start a character class */
#define CCLEND ']' /* end a character class */
#define NCCL '^' /* negates character class if 1st char. */
#define CLOSURE '*' /* Kleene closure (matches 0 or more) */
#define PCLOSE '+' /* Positive closure (1 or more) */
#define OPT '?' /* Optional closure (0 or 1) */
typedef enum action { // These are put in the pattern string
// to represent metacharacters.
M_BOL = (0x80 | '^'),
M_EOL = (0x80 | '$'),
M_ANY = (0x80 | '.'),
M_CCL = (0x80 | '['),
M_OPT = (0x80 | '?'),
M_CLOSE = (0x80 | '*'),
M_PCLOSE = (0x80 | '+')
} action;
typedef unsigned char pattern; /* pattern strings are unsigned char */
#define IS_ACTION(x) ((x)&0x80) /* true => element of pat. string is an */
/* action that represents a metacharacter */
/*----------------------------------------------------------------------*/
#define MAPSIZE 16 /* need this many bytes for character class bit map */
/*
* Advance a pointer into the pattern template
* to the next pattern element, this is a +1 for
* all pattern elements but M_CCL, where you
* to skip past both the M_CCL character and the
* bitmap that follows that character
*/
#define ADVANCE(pat) (pat += (*pat == (pattern)M_CCL) ? (MAPSIZE+1) : 1)
//
// Bitmap functions. Set bit b in the map and
// test bit b to see if it was set previously.
//
#define SETBIT(b,map) ((map)[((b) & 0x7f) >>3] |= (1<< ((b) & 0x07)) )
#define TSTBIT(b,map) ((map)[((b) & 0x7f) >>3] & (1<< ((b) & 0x07)) )
int regXimp::omatch(const char ** strp, const pattern * pat,
const char * start)
{
/*
* Match one pattern element, pointed at by pat, against the character at
* **strp. Return 0 on a failure, 1 on success. *strp is advanced to skip
* over the matched character on a successful match. Closure is handled one
* level up by patcmp().
*
* "start" points at the character at the left edge of the line. This might
* not be the same thing as *strp if the search is starting in the middle
* of the string. An end-of- line anchor matches '\n' or '\0'.
*/
int advance = -1; // amount to advance *strp, -1 == error
switch (*pat) {
case M_BOL: // First char in string?
if (*strp == start) // Only one star here.
advance = 0;
break;
case M_ANY: // . = anything but newline
if (**strp != '\n') advance = 1;
break;
case M_EOL:
if (**strp == '\n' || **strp == '\0')
advance = 0;
break;
case M_CCL:
if (TSTBIT(**strp, pat + 1)) advance = 1;
break;
default: /* literal match */
if (caseSensitive){
if (**strp == *pat) advance = 1;
}
else if (toupper(**strp) == toupper(*pat)) advance = 1;
break;
}
if (advance > 0)
*strp += advance;
return (advance + 1);
}
#define ISOCTDIGIT(x) ('0'<=(x) && (x)<='7')
static int hex2bin(int c)
{
/* Convert the hex digit represented by 'c' to an int. 'c'
* must be one of: 0123456789abcdefABCDEF
*/
return (isdigit(c) ? (c)-'0': ((toupper(c))-'A')+10) & 0xf;
}
static int oct2bin(int c)
{
/* Convert the hex digit represented by 'c' to an int. 'c'
* must be a digit in the range '0'-'7'.
*/
return ( ((c)-'0') & 0x7 );
}
/*------------------------------------------------------------*/
int esc(const char **s)
{
/* Map escape sequences into their equivalent symbols. Return
* the equivalent ASCII character. *s is advanced past the
* escape sequence. If no escape sequence is present, the
* current character is returned and the string is advanced by
* one. The following are recognized:
*
* \b backspace
* \f formfeed
* \n newline
* \r carriage return
* \s space
* \t tab
* \e ASCII ESC character ('\033')
* \DDD number formed of 1-3 octal digits
* \xDDD number formed of 1-3 hex digits
* \^C C = any letter. Control code
*/
int rval;
if( **s != '\\' )
rval = *( (*s)++ );
else {
++(*s); // Skip the '\'
switch( toupper(**s) ) {
case '\0': rval = '\\'; break;
case 'B': rval = '\b' ; break;
case 'F': rval = '\f' ; break;
case 'N': rval = '\n' ; break;
case 'R': rval = '\r' ; break;
case 'S': rval = ' ' ; break;
case 'T': rval = '\t' ; break;
case 'E': rval = '\033'; break;
case '^':
rval = *++(*s) ;
rval = toupper(rval) - '@' ;
break;
case 'X':
rval = 0;
++(*s);
if( isxdigit(**s) ) {
rval = hex2bin( *(*s)++ );
}
if( isxdigit(**s) ) {
rval <<= 4;
rval |= hex2bin( *(*s)++ );
}
if( isxdigit(**s) ) {
rval <<= 4;
rval |= hex2bin( *(*s)++ );
}
--(*s);
break;
default:
if( !ISOCTDIGIT(**s) )
rval = **s;
else {
rval = oct2bin( *(*s)++ );
if( ISOCTDIGIT(**s) ) {
rval <<= 3;
rval |= oct2bin( *(*s)++ );
}
if( ISOCTDIGIT(**s) ) {
rval <<= 3;
rval |= oct2bin( *(*s)++ );
}
--(*s);
}
break;
}
++(*s);
}
return rval;
}
/*----------------------------------------------------------------------*/
const char * regXimp::doccl(const char * src)
{
/*
* Set bits in the map corresponding to characters specified in the src
* character class.
*/
int first, last, negative;
++src; // skip past the [
negative = (*src == NCCL);
if (negative) ++src; // check for negative ccl
const char * start = src; // start of characters in class
int len = compiledPattern.length();
compiledPattern.pad(len+MAPSIZE, str::right, char(0));
char * map = (char *)compiledPattern(len);
while (*src && *src != CCLEND) {
if (*src != '-') {
first = esc(&src); // Use temp. to avoid macro
// side effects.
SETBIT(first, map);
} else if (src == start) {
SETBIT('-', map); // literal dash at start or end
++src;
} else {
++src; // skip to end-of-sequence char
// Support special characters within [].
last = esc(&src);
if (last < first){
int temp=first;
first = last;
last = temp;
};
while (++first <= last) SETBIT(first, map);
}
}
if (*src == CCLEND) ++src; // Skip CCLEND
if (negative)
for (first = MAPSIZE; --first >= 0;)
*map++ ^= ~0; /* invert all bits */
return src;
}
/*----------------------------------------------------------------------*/
const char * regXimp::patcmp(const char * sstr, const pattern * pat,
const char * start)
{
/*
* Like strcmp, but compares str against pat. Each element of str is
* compared with the template until either a mis-match is found or the end
* of the template is reached. In the former case a 0 is returned; in the
* latter, a pointer into str (pointing to the last character in the
* matched pattern) is returned. Strstart points at the first character in
* the string, which might not be the same thing as line if the search
* started in the middle of the string.
*/
const char *bocl; // beginning of closure string.
const char *end=0; // return value: end-of-string pointer.
if (!pat) return (NULL); // make sure pattern is valid
while (*pat) {
if (*pat == (pattern)M_OPT) {
/*
* Zero or one matches. It doesn't matter if omatch fails---it will
* advance str past the character on success, though. Always advance
* the pattern past both the M_OPT and the operand.
*/
omatch(&sstr, ++pat, start);
ADVANCE(pat);
} else if (!(*pat == (pattern)M_CLOSE || *pat == (pattern)M_PCLOSE)) {
//
// Do a simple match. Note that omatch() fails if there's still
// something in pat but we're at end of string.
//
if (!omatch(&sstr, pat, start)) return NULL;
ADVANCE(pat);
} else { // Process a Kleene or positive closure
if (*pat++ == (pattern)M_PCLOSE) // one match required
if (!omatch(&sstr, pat, start)) return NULL;
// Match as many as possible, zero is okay
bocl = sstr;
while (*sstr && omatch(&sstr, pat, start));
/*
* 'str' now points to the character that made made us fail. Try to
* process the rest of the string. If the character following the
* closure could have been in the closure (as in the pattern "[a-z]*t")
* the final 't' will be sucked up in the while loop. So, if the match
* fails, back up a notch and try to match the rest of the string
* again, repeating this process recursively until we get back to the
* beginning of the closure. The recursion goes, at most, one levels
* deep.
*/
if (*ADVANCE(pat)) {
do {
end = patcmp(sstr, pat, start);
if (end) break;
sstr--;
} while (bocl <= sstr);
return end;
}
break;
};
}
/*
* omatch() advances str to point at the next character to be matched. So
* str points at the character following the last character matched when
* you reach the end of the template. The exceptions are templates
* containing only a BOLN or EOLN token. In these cases omatch doesn't
* advance. Since we must return a pointer to the last matched character,
* decrement str to make it point at the end of the matched string, making
* sure that the decrement hasn't gone past the beginning of the string.
*
* Note that $ is a position, not a character, but in the case of a pattern
* ^$, a pointer to the end of line character is returned. In ^xyz$, a
* pointer to the z is returned.
*
* The --str is done outside the return statement because max() is a macro
* with side-effects.
*/
--sstr;
return (max(start, sstr));
}
/*----------------------------------------------------------------------*/
int regXimp::makepat(const char * exp)
{
/*
* Make a pattern template from the string pointed to by exp.
* The pattern template is assembled in str compiledPattern.
* Regular expression can contain one or more \n characters.
*
* Return 1 if success
*/
error = 1;
if (!*exp) goto exit;
if (*exp == CLOSURE || *exp == PCLOSE || *exp == OPT) goto exit;
int prev, len;
len=0;
while (*exp ) {
switch (*exp) {
case ANY:
prev=len++;
compiledPattern += (pattern)M_ANY;
++exp;
break;
case BOL:
prev=len++;
compiledPattern += (!compiledPattern.length()) ? (pattern)M_BOL : *exp;
++exp;
break;
case EOL:
prev=len++;
compiledPattern += (!exp[1] || exp[1] == '\n') ? (pattern)M_EOL : *exp;
++exp;
break;
case CCL:
prev=len++;
compiledPattern += (pattern)M_CCL;
exp = doccl(exp);
len += MAPSIZE;
break;
case OPT:
case CLOSURE:
case PCLOSE:
switch (compiledPattern[prev]) {
case M_BOL:
case M_EOL:
case M_OPT:
case M_PCLOSE:
case M_CLOSE:
goto exit;
}
compiledPattern.insert(prev,
(*exp == OPT) ? (pattern)M_OPT :
(*exp == PCLOSE) ? (pattern)M_PCLOSE : (pattern)M_CLOSE);
len++;
++exp;
break;
default:
prev=len++;
compiledPattern += esc(&exp);
break;
}
}
error = 0;
exit:
return (!error);
}
/*----------------------------------------------------------------------*/
const char * regXimp::matchs(const char * sstr, int p_case)
{
const char *endp = NULL;
const char *start;
const unsigned char * pat = (unsigned char *)compiledPattern();
caseSensitive = p_case;
if (!pat) return NULL;
if (*sstr == '\0') {
if ((*pat == (pattern)M_EOL) ||
(*pat == (pattern)M_BOL && (!pat[1] || pat[1] == (pattern)M_EOL)))
{
endp = sstr;
endMatch = startMatch = sstr;
}
}
else {
start = sstr; // Do a brute-force substring search,
// comparing a pattern against the input string
while (*sstr) {
endp = patcmp(sstr, pat, start);
if (endp) {
endMatch = endp;
startMatch = sstr;
break;
};
sstr++;
}
}
return endp;
}